global root_dir = "`1'"

include "$root_dir/code/config/config.do"


cap noi log using ${log_dir}/6_prepare_employment_data.log, replace name(dat)

*Handle empty arguments
global arg1 = cond("`2'" == "___EMPTY___", "", "`2'")
global arg2 = cond("`3'" == "___EMPTY___", "", "`3'")
global arg3 = cond("`4'" == "___EMPTY___", "", "`4'")
global arg4 = cond("`5'" == "___EMPTY___", "", "`5'")

if "$arg1" != "" {
    global weight_category "$arg1"
    di "Weight category: ${weight_category}"
}

if "$arg2" != "" {
    global weight_versions "$arg2"
    di "Weight versions: ${weight_versions}"
}

if "$arg3" != "" {
    global weight_window "$arg3"
    di "Weight window: ${weight_window}"
}

if "$arg4" != "" {
	global wtype "$arg4"
}
di "${wtype}"
capture noi {


* ---------------------------------------------------*
* Prep BEA VA + total compensation for 1980 and 1997 *
* ---------------------------------------------------*

*** This first section recreates an excel originally done by hand for easier import. Instead of rewriting the whole code we simply do the steps here now.***

*create a program to assign the correct sic codes. This file wirdly sometimes collects different codes together.
* if there are two entries in the inlist statement, the second one refers not to SIC72 but to the SIC 87 classification
cap program drop assign_sic

program assign_sic
    replace sic = "01, 02" if inlist(title, "Farms")
    replace sic = "07, 08, 09" if inlist(title, "Agricultural services, forestry, and fishing")
    replace sic = "10" if inlist(title, "Metal mining")
    replace sic = "11, 12" if inlist(title, "Coal mining")
    replace sic = "13" if inlist(title, "Oil and gas extraction")
    replace sic = "14" if inlist(title, "Nonmetallic minerals, except fuels")
    replace sic = "15, 16, 17" if inlist(title, "Construction")
    replace sic = "24" if inlist(title, "Lumber and wood products")
    replace sic = "25" if inlist(title, "Furniture and fixtures")
    replace sic = "32" if inlist(title, "Stone, clay, and glass products")
    replace sic = "33" if inlist(title, "Primary metal industries")
    replace sic = "34" if inlist(title, "Fabricated metal products")
    replace sic = "35" if inlist(title, "Machinery, except electrical", "Industrial machinery and equipment")
    replace sic = "36" if inlist(title, "Electric and electronic equipment", "Electronic and other electric equipment")
    replace sic = "371" if inlist(title, "Motor vehicles and equipment")
    replace sic = "372, 373, 374, 375, 376, 378, 379" if inlist(title, "Other transportation equipment")
    replace sic = "38" if inlist(title, "Instruments and related products")
    replace sic = "39" if inlist(title, "Miscellaneous manufacturing industries")
    replace sic = "20" if inlist(title, "Food and kindred products")
    replace sic = "21" if inlist(title, "Tobacco products")
    replace sic = "22" if inlist(title, "Textile mill products")
    replace sic = "23" if inlist(title, "Apparel and other textile products")
    replace sic = "26" if inlist(title, "Paper and allied products")
    replace sic = "27" if inlist(title, "Printing and publishing")
    replace sic = "28" if inlist(title, "Chemicals and allied products")
    replace sic = "29" if inlist(title, "Petroleum and coal products")
    replace sic = "30" if inlist(title, "Rubber and miscellaneous plastics products")
    replace sic = "31" if inlist(title, "Leather and leather products")
    replace sic = "40" if inlist(title, "Railroad transportation")
    replace sic = "41" if inlist(title, "Local and interurban passenger transit")
    replace sic = "42" if inlist(title, "Trucking and warehousing")
    replace sic = "44" if inlist(title, "Water transportation")
    replace sic = "45" if inlist(title, "Transportation by air")
    replace sic = "46" if inlist(title, "Pipelines, except natural gas")
    replace sic = "47" if inlist(title, "Transportation services")
    replace sic = "48" if inlist(title, "Communications")
    replace sic = "481,482,489" if inlist(title, "Telephone and telegraph")
    replace sic = "483" if inlist(title, "Radio and television")
    replace sic = "49" if inlist(title, "Electric, gas, and sanitary services")
    replace sic = "50, 51" if inlist(title, "Wholesale trade")
    replace sic = "52, 53, 54, 55, 56, 57, 58, 59" if inlist(title, "Retail trade")
    replace sic = "60" if inlist(title, "Banking", "Depository institutions")
    replace sic = "61" if inlist(title, "Credit agencies other than banks", "Nondepository institutions")
    replace sic = "62" if inlist(title, "Security and commodity brokers")
    replace sic = "63" if inlist(title, "Insurance carriers")
    replace sic = "64" if inlist(title, "Insurance agents, brokers, and service")
    replace sic = "65, 66" if inlist(title, "Real estate /2/", "Real Estate /2/")
    replace sic = "67" if inlist(title, "Holding and other investment offices")
    replace sic = "70" if inlist(title, "Hotels and other lodging places")
    replace sic = "72" if inlist(title, "Personal services")
    replace sic = "73" if inlist(title, "Business services")
    replace sic = "75" if inlist(title, "Auto repair, services, and parking")
    replace sic = "76" if inlist(title, "Miscellaneous repair services")
    replace sic = "78" if inlist(title, "Motion pictures")
    replace sic = "79" if inlist(title, "Amusement and recreation services")
    replace sic = "80" if inlist(title, "Health services")
    replace sic = "81" if inlist(title, "Legal services")
    replace sic = "82" if inlist(title, "Educational services")
    replace sic = "83" if inlist(title, "Social services")
    replace sic = "86" if inlist(title, "Membership organizations")
    replace sic = "84, 89" if inlist(title, "Miscellaneous professional services", "Other services")
    replace sic = "88" if inlist(title, "Private households")
    replace sic = "43, 91, 92, 93, 94, 95, 96, 97" if inlist(title, "Government")
end program

**Combine the BEA data for 1980 and 1997 for compensation and Value added**
* There are some changes to naming of the categories. all four variables of interest are in different sheets...

**load 1980 VA first**
*Bureau of Economic Analysis (BEA). n.d. "Historical Industry Accounts Data." Accessed November 2018. https://www.bea.gov/industry/io-histannual.
import excel using ${alm_data_raw}/bea/GDPbyInd_VA_SIC.xls, sheet("72SIC_VA, GO, II") firstrow clear

*rename variables
foreach var of varlist _all {
    local lbl : variable label `var'
    if `"`lbl'"' != "" {
        rename `var' `=strtoname("`lbl'")'
    }
}

keep if Code == "VA"
keep Industry_Title _1980
rename _1980 va_1980
rename Industry_Title title

*trim spaces at beginning and end of title
replace title = trim(title)
gen sic = ""
*and now a whole bunch of SIC assigning. 
assign_sic
drop if sic == ""

* Convert va_1980 to long, currently is a string
destring va_1980, replace force
*generate a beaind variable that goes from 1 to the number of unique industries, to keep sort order
gen beaind = _n 

tempfile bea_va_1980
save `bea_va_1980', replace


**load 1980 compensation**
*Bureau of Economic Analysis (BEA). n.d. "Historical Industry Accounts Data." Accessed November 2018. https://www.bea.gov/industry/io-histannual.
import excel using ${alm_data_raw}/bea/GDPbyInd_VA_SIC.xls, sheet("72SIC_Components of VA") firstrow clear

*rename variables
foreach var of varlist _all {
    local lbl : variable label `var'
    if `"`lbl'"' != "" {
        rename `var' `=strtoname("`lbl'")'
    }
}
keep if Code == "COMP"
keep Industry_Title _1980
rename Industry_Title title
rename _1980 compensation_1980

*trim spaces at beginning and end of title
replace title = trim(title)
gen sic = ""
*and now a whole bunch of SIC assigning.
assign_sic
drop if sic == ""

destring compensation_1980, replace force

tempfile bea_compensation_1980
save `bea_compensation_1980', replace



** load 1997 VA
*rename the variables
*Bureau of Economic Analysis (BEA). n.d. "Historical Industry Accounts Data." Accessed November 2018. https://www.bea.gov/industry/io-histannual.
import excel using ${alm_data_raw}/bea/GDPbyInd_VA_SIC.xls, sheet("87SIC_VA, GO, II") firstrow clear

*rename variables
foreach var of varlist _all {
    local lbl : variable label `var'
    if `"`lbl'"' != "" {
        rename `var' `=strtoname("`lbl'")'
    }
}
keep if Code == "VA"
keep Industry_Title _1997
rename _1997 va_1997
rename Industry_Title title
*trim spaces at beginning and end of title
replace title = trim(title)
gen sic = ""
*and now a whole bunch of SIC assigning.
assign_sic
drop if sic == ""
destring va_1997, replace force

tempfile bea_va_1997
save `bea_va_1997', replace

**load comp in 1997
*Bureau of Economic Analysis (BEA). n.d. "Historical Industry Accounts Data." Accessed November 2018. https://www.bea.gov/industry/io-histannual.
import excel using ${alm_data_raw}/bea/GDPbyInd_VA_SIC.xls, sheet("87SIC_Components of VA") firstrow clear

*rename variables
foreach var of varlist _all {
    local lbl : variable label `var'
    if `"`lbl'"' != "" {
        rename `var' `=strtoname("`lbl'")'
    }
}
keep if Code == "COMP"
keep Industry_Title _1997
rename Industry_Title title
rename _1997 compensation_1997
*trim spaces at beginning and end of title
replace title = trim(title)
gen sic = ""
*and now a whole bunch of SIC assigning.
assign_sic
drop if sic == ""
destring compensation_1997, replace force

tempfile bea_compensation_1997
save `bea_compensation_1997', replace

use `bea_va_1980', clear
mmerge sic using `bea_compensation_1980'
mmerge sic using `bea_va_1997'
mmerge sic using `bea_compensation_1997'
drop _m

order sic title va_1980 compensation_1980 va_1997 compensation_1997 beaind
sort beaind

*** end of excel recreation***



*sic values are seperated by commas in the cell, so we need to split them
split sic, parse(",")
drop sic
reshape long sic, i(beaind) j(index)
drop index
drop if missing(sic)
reshape long compensation_ va_, i(sic) j(year)
ren (va_ compensation_) (va compensation)

label var sic "SIC industry code"
label var beaind "BEA industry code"
label var title "Industry description"
label var va "Value added"
label var compensation "Total compensation"

save ${alm_data_proc}/beaind_va_compensation_8097.dta, replace

*use the SIC to ALM consistent industry crosswalk
use ${alm_data_proc}/cw_sic4_ind6090.dta, clear

*truncate to get SIC 3
replace sic_3 = substr(sic_3, 1, 3)
drop SIC
ren sic_3 sic
*this keeps only the SIC 3 for radio, motor vehicles and telephone related activities
mmerge sic using ${alm_data_proc}/beaind_va_compensation_8097.dta, unmatched(none)
drop _m
tempfile sic3_beaind 
save `sic3_beaind', replace

*use the SIC to ALM consistent industry crosswalk, this time do the same thing for SIC 2
*should keep basically all other codes not contained before (Banking, Mining, Farming etc)
use ${alm_data_proc}/cw_sic4_ind6090.dta, clear
replace sic_2 = substr(sic_2, 1, 2)
drop SIC
ren sic_2 sic
mmerge sic using ${alm_data_proc}/beaind_va_compensation_8097.dta, unmatched(none)
drop _m
tempfile sic2_beaind 
save `sic2_beaind', replace

*now combine the two and create two new crosswalks to the BEA industry codes
append using `sic3_beaind'
keep sic4 ind6090 title beaind year weight va compensation
sort sic4
list if weight != 1
drop weight
label var sic4 "SIC 4 industry code"
label var ind6090 "ALM industry code"
preserve
keep sic4 beaind
save ${alm_data_proc}/cw_sic4_beaind.dta, replace
restore
preserve
keep ind6090 beaind
save ${alm_data_proc}/cw_ind6090_beaind.dta, replace
restore
keep beaind title year va compensation
duplicates drop
sleep 1000
save ${alm_data_proc}/beaind_va_compensation_8097.dta, replace
sleep 1000

sort beaind year
gen lshare = compensation / va
bys beaind: gen d_lshare = lshare - lshare[_n-1]
label var d_lshare "difference of labor compensation share of value added 80-97"

keep if year == 1997
keep beaind d_lshare title
sort beaind

*save the difference in labor compensation between 1980 and 1997
save ${alm_data_proc}/beaind_lshare_198097.dta, replace


* ---------------------------------------------
* Prepare NBER data
* ---------------------------------------------
* National Bureau of Economic Research (NBER). n.d. "NBER-CES Manufacturing Industry Database." 
*Accessed September 2021. https://www.nber.org/research/data/nber-ces-manufacturing-industry-database.
use ${alm_data_raw}/nber/nberces5818v1_s1987.dta, clear
keep if year >= 1989 & year <= 1998
sort sic year

*sum up investment over period by SIC industry
collapse (sum) invest_sum = invest, by(sic)
tempfile invest_sum_sic
save `invest_sum_sic', replace
tostring sic, gen(sic4)
mmerge sic4 using ${alm_data_proc}/cw_sic4_ind6090.dta, unmatched(master)
collapse (sum) invest_sum [aw=weight], by(ind6090)
tempfile invest_sum_ind6090
save `invest_sum_ind6090', replace

*National Bureau of Economic Research (NBER). n.d. "NBER-CES Manufacturing Industry Database."
* Accessed September 2021. https://www.nber.org/research/data/nber-ces-manufacturing-industry-database.
use ${alm_data_raw}/nber/nberces5818v1_s1987.dta, clear
keep if year == 1980 | year == 1998
sort sic year
preserve
gen lshare = pay/vadd
gen ship = vship/vadd
foreach var in emp lshare ship invest { 
    gen log_`var' = log(`var')
    bys sic: gen dg_`var' = ((`var' - `var'[_n-1])/`var'[_n-1])
    bys sic: gen d_`var' = `var' - `var'[_n-1]
    bys sic: gen dlog_`var' = log_`var' - log_`var'[_n-1]
}
by sic: gen emp_mid = (emp + emp[_n-1])/2
bys year: egen s_emp_mid = total(emp_mid)
gen emp_mid_weight = emp_mid / s_emp_mid
mmerge sic using `invest_sum_sic', unmatched(master)
bys sic: gen invest_sum_emp_mid = invest_sum / emp_mid
keep if year == 1998
keep sic year vadd lshare ship d_ship d_lshare dg_lshare emp log_emp dlog_emp dlog_lshare emp_mid_weight log_invest dlog_invest invest_sum_emp_mid

label var lshare "Labor share"
label var ship "Shipping value to value added"
label var log_emp "Log employment"
label var dlog_emp "Change in log employment"
label var dg_lshare "Growth in labor share"
label var d_lshare "Change in labor share"
label var dlog_lshare "Change in log labor share"
label var d_ship "Change in shipping value to value added"
label var log_invest "Log investment"
label var dlog_invest "Change in log investment"
label var emp_mid_weight "Moving average employment weight"
label var invest_sum_emp_mid "Investment (in M $) per avergae employees"

save ${alm_data_proc}/nber_sic4_198098.dta, replace
restore


tostring sic, gen(sic4)

*merge in the ALM consistent industry codes
mmerge sic4 using ${alm_data_proc}/cw_sic4_ind6090.dta, unmatched(master)

* sum up by year and industry
collapse (sum) emp vadd pay vship invest [aw=weight], by(year ind6090)
sort ind6090 year
gen lshare = pay/vadd
gen ship = vship/vadd
gen invest_emp = invest / emp
foreach var in emp lshare ship invest {
    gen log_`var' = log(`var')
    bys ind6090: gen dg_`var' = ((`var' - `var'[_n-1])/`var'[_n-1])
    bys ind6090: gen d_`var' = `var' - `var'[_n-1]
    bys ind6090: gen dlog_`var' = log_`var' - log_`var'[_n-1]
}
by ind6090: gen invest_emp_mid = (invest_emp + invest_emp[_n-1])/2
by ind6090: gen emp_mid = (emp + emp[_n-1])/2
by ind6090: gen emp_init = emp[_n-1]
by ind6090: gen log_invest_init = log_invest[_n-1]
mmerge ind6090 using `invest_sum_ind6090', unmatched(master)
bys ind6090: gen invest_sum_emp_mid = invest_sum / emp_mid
clonevar emp_end = emp
bys year: egen s_emp_mid = total(emp_mid)
gen emp_mid_weight = emp_mid / s_emp_mid
keep if year == 1998
keep ind6090 vadd year lshare dlog_invest log_invest_init ship d_ship d_lshare dg_lshare log_emp dlog_emp dlog_lshare emp_mid_weight emp_init emp_mid emp_end invest_emp_mid invest_sum_emp_mid

label var lshare "Labor share"
label var ship "Shipping value to value added"
label var log_emp "Log employment"
label var dlog_emp "Change in log employment"
label var dg_lshare "Growth in labor share"
label var d_lshare "Change in labor share"
label var dlog_lshare "Change in log labor share"
label var d_ship "Change in shipping value to value added"
label var log_invest "Log investment"
label var dlog_invest "Change in log investment"
label var emp_mid_weight "Moving average employment weight"
label var invest_sum_emp_mid "Investment (in M $) per average employees"
label var vadd "Value added"
label var invest_emp_mid "Average investment (in M $) per 1000 employees"
label var emp_mid "Moving average employment"
label var emp_init "Initial employment (1980)"
label var emp_end "Final employment (1998)"

save ${alm_data_proc}/nber_ind6090_198098.dta, replace


* ---------------------------------------------
* Prepare trade data
* ---------------------------------------------
*Institution for Social and Policy Studies (ISPS), Yale University. n.d. 
*"Trading Barriers: Immigration and the Remaking of Globalization" Accessed April 2012. https://isps.yale.edu/research/data/d139.
use ${alm_data_raw}/xm_sic87_72_105_20120424.dta, clear
keep if year == 1980 | year == 1998

*collapse to SIC level by year
collapse (sum) x customs cif, by(sic year)
label var x "Exports"
label var customs "Customs value of general imports"
label var cif "Cost, insurance, freight value of general imports"

*merge in consistent industries
sort sic year
tostring sic, gen(sic4)
mmerge sic4 using ${alm_data_proc}/cw_sic4_ind6090.dta, unmatched(master)
collapse (sum) x customs cif [aw=weight], by(ind6090 year)
label var x "Exports (weighted)"
label var customs "Customs value of general imports (weighted)"
label var cif "Cost, insurance, freight value of general imports (weighted)"
label var ind6090 "ALM industry code"

sort ind6090
mmerge ind6090 using ${alm_data_proc}/nber_ind6090_198098.dta

*trade as imports plus exports
gen trade = (cif + x)/vadd
foreach var in trade {
    gen log_`var' = log(`var')
    bys ind6090: gen dg_`var' = ((`var' - `var'[_n-1])/`var'[_n-1])
    bys ind6090: gen d_`var' = `var' - `var'[_n-1]
    bys ind6090: gen dlog_`var' = log_`var' - log_`var'[_n-1]
}
by ind6090: gen trade_init = trade[_n-1]
keep if year == 1998
keep ind6090 trade_init d_trade

label var trade_init "Trade in terms of VA (1980)"
label var d_trade "Change in trade in terms of VA 1980-1998"
save ${alm_data_proc}/trade_ind6090_198098.dta, replace

}
if _rc == 0 {
    display "Execution finished successfully."
}
else {
    display "Execution finished with errors."
}

cap log close dat